# Importing libraries
import zipfile
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
import os
import matplotlib.pyplot as plt  # visualization
from PIL import Image
%matplotlib inline
import seaborn as sns  # visualization
import itertools
import warnings
warnings.filterwarnings("ignore")
import io
import plotly.offline as py  # visualization
py.init_notebook_mode(connected=True)  # visualization
import plotly.graph_objs as go  # visualization
import plotly.tools as tls  # visualization
import plotly.figure_factory as ff  # visualization
import plotly.express as px
start_time = pd.Timestamp.now()  # pd.datetime is deprecated; use pd.Timestamp
%%time
#load file from local zip
zf = zipfile.ZipFile('expedia-hotel-recommendations.zip')
df = pd.read_csv(zf.open('train.csv'))
#load file from s3 with pyspark
'''
def read_csv_s3(path):
    import os
    from pyspark.sql import SparkSession
    AWS_USER = os.environ.get('AWSUSER')
    AWS_PASSWORD = os.environ.get('AWSPASSWORD')
    os.environ['PYSPARK_SUBMIT_ARGS'] = ("--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell")
    spark = SparkSession.builder.appName('S3CSVRead').getOrCreate()
    # s3a:// paths need the S3AFileSystem implementation and the fs.s3a.* credential keys
    spark._jsc.hadoopConfiguration().set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", AWS_USER)
    spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", AWS_PASSWORD)
    df = spark.read.csv(path, header=True)
    return df, spark
'''
#%%time
#path="s3a://593research/train.csv"
#df,spark = read_csv_s3(path)
len(df)
df.head()
#randomly sample 1% of the training dataset for data manipulation and model training
data = df.sample(frac=0.01, random_state=99)
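# Sanity check (added sketch): confirm the 1% sample is still sizeable and that every
# hotel_cluster present in the full data also appears in the sample.
print('sample rows:', len(data))
print('clusters in full data:', df['hotel_cluster'].nunique(), '| clusters in sample:', data['hotel_cluster'].nunique())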
%%time
print ("Rows : " ,data.shape[0])
print ("Columns : " ,data.shape[1])
print ("\nFeatures : \n" ,data.columns.tolist())
print ("\nMissing values : ", data.isnull().sum().values.sum())
print ("\nUnique values : \n",data.nunique())
##missing value percentage
percent_missing = data.isnull().sum() * 100 / len(data)
missing_value_df = pd.DataFrame({'column_name': data.columns,
'percent_missing': percent_missing})
missing_value_df
data.describe()
#to check data types
data.dtypes
#change data types
columns = data.columns.tolist()
datetime_var = ['date_time','srch_ci','srch_co']
numeric_var = ['orig_destination_distance','is_mobile','is_package','srch_adults_cnt','srch_children_cnt','srch_rm_cnt',
'is_booking','cnt']
cat_var = [i for i in columns if(i not in datetime_var and i not in numeric_var)]
def change_datatype():
    # convert the search/check-in/check-out timestamp columns to pandas datetime
    for i in datetime_var:
        data[i] = data[i].astype('datetime64[ns]')
    return data
data = change_datatype()
#check data types
data.dtypes
#define a function to plot an interactive distribution graph
def distribution_plot(dataset, column, title, xtitle, ytitle):
    trace = go.Histogram(x=dataset[column], opacity=0.7, marker={"line": {"color": "#25232C"}})
    layout = go.Layout(title=title, xaxis={"title": xtitle, "showgrid": False},
                       yaxis={"title": ytitle, "showgrid": False},
                       plot_bgcolor='rgba(0,0,0,0)',
                       paper_bgcolor='rgba(0,0,0,0)')  # showgrid: False removes the gridlines
    figure = {"data": [trace], "layout": layout}
    py.iplot(figure)
distribution_plot(data, 'hotel_cluster', "Hotel Cluster Distribution", "Hotel Cluster", "Count")
#hotel cluster counts
data['hotel_cluster'].value_counts()
plot_data = data.copy()
plot_data['is_mobile'] = plot_data["is_mobile"].replace({1:"Yes",0:"No"})
plot_data['is_package'] = plot_data["is_package"].replace({1:"Yes",0:"No"})
plot_data['is_booking'] = plot_data["is_booking"].replace({1:"Yes",0:"No"})
book = plot_data[plot_data['is_booking']=="Yes"]
notbook = plot_data[plot_data['is_booking']=="No"]
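# Share of booking vs. click-only events in the sample (added sketch); the histograms below
# are percent-normalized within each group, so this class imbalance does not distort the comparison.
print(plot_data['is_booking'].value_counts(normalize=True))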
#define a function to plot distribution by booking decision
def histogram(column):
    trace1 = go.Histogram(x=book[column],
                          histnorm="percent",
                          name="Booked",
                          marker=dict(line=dict(width=.5, color="black")),
                          opacity=.9)
    trace2 = go.Histogram(x=notbook[column],
                          histnorm="percent",
                          name="Not Booked",
                          marker=dict(line=dict(width=.5, color="black")),
                          opacity=.9)
    hist_data = [trace1, trace2]  # renamed from 'data' to avoid shadowing the DataFrame
    layout = go.Layout(dict(title=column + " distribution by booking decision",
                            plot_bgcolor="rgb(243,243,243)",
                            paper_bgcolor="rgb(243,243,243)",
                            xaxis=dict(gridcolor='rgb(255, 255, 255)',
                                       title=column,
                                       zerolinewidth=1,
                                       ticklen=5,
                                       gridwidth=2),
                            yaxis=dict(gridcolor='rgb(255, 255, 255)',
                                       title="percent",
                                       zerolinewidth=1,
                                       ticklen=5,
                                       gridwidth=2),
                            ))
    fig = go.Figure(data=hist_data, layout=layout)
    py.iplot(fig)
col = ['orig_destination_distance', 'is_mobile', 'is_package', 'channel', 'srch_adults_cnt', 'srch_children_cnt',
       'srch_rm_cnt', 'srch_destination_type_id', 'hotel_continent']
for i in col:
    histogram(i)
#check number of rows that have missing values
dis_na = data['orig_destination_distance'].isnull().values.ravel().sum()
print(dis_na)
def filled_distance():
    dis_na = data['orig_destination_distance'].isnull().values.ravel().sum()
    # first step: fill by the user location city and search destination id group
    # filled about 4k records, 131808 NA left
    data['orig_destination_distance'] = data.groupby(['user_location_city', 'srch_destination_id'])['orig_destination_distance'].transform(
        lambda x: x.fillna(np.mean(x)))
    dis_na1 = data['orig_destination_distance'].isnull().values.ravel().sum()
    print('Filled {} records and there are {} NA records left'.format(dis_na - dis_na1, dis_na1))
    # second step: fill by the user location city and hotel country group
    # filled 64651 records and 67157 left
    data['orig_destination_distance'] = data.groupby(['user_location_city', 'hotel_country'])['orig_destination_distance'].transform(
        lambda x: x.fillna(np.mean(x)))
    dis_na2 = data['orig_destination_distance'].isnull().values.ravel().sum()
    print('Filled {} records and there are {} NA records left'.format(dis_na1 - dis_na2, dis_na2))
    # third step: fill by the user location country and search destination type id group
    # filled 32507 records and 34650 left
    data['orig_destination_distance'] = data.groupby(['user_location_country', 'srch_destination_type_id'])['orig_destination_distance'].transform(
        lambda x: x.fillna(np.mean(x)))
    dis_na3 = data['orig_destination_distance'].isnull().values.ravel().sum()
    print('Filled {} records and there are {} NA records left'.format(dis_na2 - dis_na3, dis_na3))
    # fourth step: fill by the posa continent group
    # filled all the remaining missing values
    data['orig_destination_distance'] = data.groupby(['posa_continent'])['orig_destination_distance'].transform(
        lambda x: x.fillna(np.mean(x)))
    dis_na4 = data['orig_destination_distance'].isnull().values.ravel().sum()
    print('Filled {} records and there are {} NA records left'.format(dis_na3 - dis_na4, dis_na4))
    return data
data = filled_distance()
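# Post-fill check (added sketch): confirm the four-step fill left no missing distances;
# if a grouping level was entirely NA in this sample, fall back to the overall mean.
remaining_na = data['orig_destination_distance'].isnull().sum()
if remaining_na > 0:
    data['orig_destination_distance'] = data['orig_destination_distance'].fillna(
        data['orig_destination_distance'].mean())
print('remaining NA after fallback:', data['orig_destination_distance'].isnull().sum())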
print('duration: ', pd.Timestamp.now() - start_time)
target = ['hotel_cluster']
userid = ['user_id']
cat_var = [i for i in cat_var if i not in target + userid] + userid  # keep user_id once, at the end
print(target)
print(cat_var)
#train test split
from sklearn.model_selection import train_test_split
train,test = train_test_split(data,test_size =.25 ,random_state = 111)
cols = [i for i in data.columns if i not in target]
train_X = train[cols]
train_Y = train[target]
test_X = test[cols]
test_Y = test[target]
train1 = train.copy()
def target_encoder(df, df_toencoded, column, target, method='mean'):
    # encode a categorical column with a statistic of the target computed on df,
    # then map those values onto df_toencoded
    if method == 'mean':
        df1 = df.groupby(column)[target].mean().reset_index()
    elif method == 'median':
        df1 = df.groupby(column)[target].median().reset_index()
    elif method == 'std':
        df1 = df.groupby(column)[target].std().reset_index()
    elif method == 'mode':
        df1 = df.groupby(column)[target].apply(pd.Series.mode).reset_index()
    else:
        raise ValueError("Incorrect method supplied: '{}'. Must be one of 'mean', 'median', 'std', 'mode'".format(method))
    encode_dict = {k: v for k, v in zip(df1[column], df1[target])}
    encoded_column = [encode_dict[k] for k in df_toencoded[column]]
    return encoded_column
encode_col = ['hotel_country']
#for i in encode_col:
# train[i] = target_encoder(train1,train,i,'hotel_cluster','mode')
#train.head()
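# Hedged usage sketch (assumption: applying the same train-derived statistics to the test
# split; 'hotel_country_enc' is an illustrative column name, not from the original code).
# target_encoder raises a KeyError for categories unseen in train, so .map() with a
# global-mean fallback is used here instead of the raw dictionary lookup.
#for i in encode_col:
#    mapping = train1.groupby(i)['hotel_cluster'].mean()
#    fallback = train1['hotel_cluster'].mean()
#    train['hotel_country_enc'] = train[i].map(mapping)
#    test['hotel_country_enc'] = test[i].map(mapping).fillna(fallback)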
# previous hotel cluster (most frequent cluster per user in the training data)
train_user = set(train.user_id)
test_user = set(test.user_id)
new_user = test_user-train_user
new_user = list(new_user)
train['rec_hotel_cluster'] = train.groupby(['user_id'])['hotel_cluster'].transform(
    lambda x: pd.Series.mode(x)[0])
#for existing users, match rec_hotel_cluster
#for new users, find similar users via clustering
data[data['user_id'].isin(train_user)]  # preview rows for users that also appear in train
#get each test user's earliest record and plot a K-Means elbow plot to find the optimal number of clusters
test['earliest'] = test.groupby(['user_id'])['date_time'].transform(lambda x: x.min())
test_df = test[test['date_time'] == test['earliest']]
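# Equivalent one-liner (sketch): sorting by date_time and dropping duplicate user_ids keeps
# each test user's earliest search without the helper 'earliest' column, and keeps exactly
# one row per user even when timestamps tie.
#test_df = test.sort_values('date_time').drop_duplicates(subset='user_id', keep='first')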
from sklearn.cluster import KMeans
X_df = test_df[['site_name','user_location_country','hotel_country','hotel_market','is_booking','user_id']]
X_df = X_df.set_index('user_id')
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(X_df)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 11), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(X_df)
X_df['cluster'] = pred_y
X_df = X_df.reset_index()
cluster_0 = list(set(X_df[X_df['cluster']==0]['user_id']))
cluster_1 = list(set(X_df[X_df['cluster']==1]['user_id']))
cluster_2 = list(set(X_df[X_df['cluster']==2]['user_id']))
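# Quick look (added sketch) at how the unique test users split across the three K-Means clusters.
print('cluster sizes:', len(cluster_0), len(cluster_1), len(cluster_2))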
def rec_cluster():
    # find existing users and match the rec_hotel_cluster value based on the train data
    cluster_dict = {k: v for k, v in zip(train['user_id'], train['rec_hotel_cluster'])}
    df_old = test[test['user_id'].isin(train_user)].copy()
    df_new = test[~test['user_id'].isin(train_user)]
    col_old = [cluster_dict[k] for k in df_old['user_id']]
    df_old['rec_hotel_cluster'] = col_old
    # for new users, recommend the mode hotel_cluster within their similar-user cluster
    df_new0 = df_new[df_new['user_id'].isin(cluster_0)].copy()
    df_new0['rec_hotel_cluster'] = df_new0['hotel_cluster'].mode()[0]
    df_new1 = df_new[df_new['user_id'].isin(cluster_1)].copy()
    df_new1['rec_hotel_cluster'] = df_new1['hotel_cluster'].mode()[0]
    df_new2 = df_new[df_new['user_id'].isin(cluster_2)].copy()
    df_new2['rec_hotel_cluster'] = df_new2['hotel_cluster'].mode()[0]
    frames = [train, df_old, df_new0, df_new1, df_new2]
    result = pd.concat(frames)
    return result
df_new = test[~test['user_id'].isin(train_user)]
df_new0 = df_new[df_new['user_id'].isin(cluster_0)]
df_new0
data = rec_cluster()
data = data.drop(['earliest'], axis=1)
data.to_csv('data.csv')
print('duration: ', pd.Timestamp.now() - start_time)